{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import csv\n",
    "\n",
    "# Root of the local AudioSet copy (the per-subset csv files are read from here)\n",
    "dataset_dir = '/beegfs/work/AudioSet'\n",
    "# Directory holding one sub-directory per downloaded subset\n",
    "data_dir = os.path.join(dataset_dir, 'data')\n",
    "\n",
    "# Media file extensions (not referenced by the cells below)\n",
    "audio_ext, video_ext = '.flac', '.mp4'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Find out which files have not been downloaded from AudioSet.\n",
    "#\n",
    "# For each subset directory found in data_dir, the segments listed in\n",
    "# <dataset_dir>/<subset>.csv are compared with the files present in the\n",
    "# subset's 'audio' and 'video' folders. Segments lacking either file are\n",
    "# collected in missing_files and written to <dataset_dir>/<subset>-missing.csv.\n",
    "\n",
    "missing_files = {}\n",
    "missing_audio_files = {}\n",
    "missing_video_files = {}\n",
    "\n",
    "for subset_name in os.listdir(data_dir):\n",
    "    subset_dir = os.path.join(data_dir, subset_name)\n",
    "    if not os.path.isdir(subset_dir):\n",
    "        continue\n",
    "\n",
    "    subset_path = os.path.join(dataset_dir, \"{}.csv\".format(subset_name))\n",
    "\n",
    "    missing_files[subset_name] = []\n",
    "    missing_audio_files[subset_name] = []\n",
    "    missing_video_files[subset_name] = []\n",
    "\n",
    "    # Get the files that have been downloaded (file names without extension)\n",
    "    local_subset_audio_files = {os.path.splitext(fname)[0]\n",
    "                                for fname in os.listdir(os.path.join(subset_dir, 'audio'))}\n",
    "    local_subset_video_files = {os.path.splitext(fname)[0]\n",
    "                                for fname in os.listdir(os.path.join(subset_dir, 'video'))}\n",
    "\n",
    "    # Get all files from the subset csv file.\n",
    "    # newline='' is what the csv module expects of files it reads or writes.\n",
    "    with open(subset_path, 'r', newline='') as f:\n",
    "        for row in csv.reader(f):\n",
    "            # Skip blank lines (parsed as an empty row) and commented lines\n",
    "            if not row or row[0].startswith('#'):\n",
    "                continue\n",
    "            ytid, ts_start, ts_end = row[0], float(row[1]), float(row[2])\n",
    "            # Expected media file name (without extension): <ytid>_<start ms>_<end ms>\n",
    "            tms_start, tms_end = int(ts_start * 1000), int(ts_end * 1000)\n",
    "            media_filename = '{}_{}_{}'.format(ytid, tms_start, tms_end)\n",
    "\n",
    "            missing_audio = media_filename not in local_subset_audio_files\n",
    "            missing_video = media_filename not in local_subset_video_files\n",
    "\n",
    "            # Keep track of missing audio or video files\n",
    "            if missing_audio or missing_video:\n",
    "                missing_files[subset_name].append(row)\n",
    "\n",
    "                # Keep track of the audio and videos separately for comparison\n",
    "                if missing_audio:\n",
    "                    missing_audio_files[subset_name].append(row)\n",
    "                if missing_video:\n",
    "                    missing_video_files[subset_name].append(row)\n",
    "\n",
    "    # Write a new csv containing only the YouTube video segments with missing files\n",
    "    missing_subset_path = os.path.join(dataset_dir, \"{}-missing.csv\".format(subset_name))\n",
    "    with open(missing_subset_path, 'w', newline='') as f:\n",
    "        writer = csv.writer(f)\n",
    "        writer.writerows(missing_files[subset_name])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "eval_segments: Missing files for 578 YouTube videos\n",
      "unbalanced_train_segments: Missing files for 375165 YouTube videos\n",
      "balanced_train_segments: Missing files for 657 YouTube videos\n"
     ]
    }
   ],
   "source": [
    "# Report, per subset, how many csv rows are missing at least one media file\n",
    "for subset in missing_files:\n",
    "    num_missing = len(missing_files[subset])\n",
    "    print(\"{}: Missing files for {} YouTube videos\".format(subset, num_missing))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get the number of YouTube videos with audio but no video for each subset.\n",
    "# A segment has audio but no video when its video file is missing while its\n",
    "# audio file is not, i.e. its row is in missing_video_files but not in\n",
    "# missing_audio_files.\n",
    "for subset in missing_files:\n",
    "    # csv rows are lists; convert them to tuples so they can be stored in sets\n",
    "    missing_audio_rows = set(map(tuple, missing_audio_files[subset]))\n",
    "    missing_video_rows = set(map(tuple, missing_video_files[subset]))\n",
    "    audio_no_video = list(missing_video_rows - missing_audio_rows)\n",
    "    print(\"{}: {} YouTube videos with audio but no video\".format(subset, len(audio_no_video)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Get the number of YouTube videos with video but no audio for each subset.\n",
    "# A segment has video but no audio when its audio file is missing while its\n",
    "# video file is not, i.e. its row is in missing_audio_files but not in\n",
    "# missing_video_files.\n",
    "for subset in missing_files:\n",
    "    # csv rows are lists; convert them to tuples so they can be stored in sets\n",
    "    missing_audio_rows = set(map(tuple, missing_audio_files[subset]))\n",
    "    missing_video_rows = set(map(tuple, missing_video_files[subset]))\n",
    "    video_no_audio = list(missing_audio_rows - missing_video_rows)\n",
    "    print(\"{}: {} YouTube videos with video but no audio\".format(subset, len(video_no_audio)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
